This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

# Load modeling, explanation, and preprocessing libraries.
# NOTE(review): tidyquant also attaches the tidyverse (readr, dplyr, tidyr,
# forcats, ...) — presumably how read_csv()/drop_na()/fct_recode() below are
# available; confirm against the installed tidyquant version.
library(keras)
library(lime)
library(tidyquant)
library(rsample)
library(recipes)
library(yardstick)
library(corrr)  # correlate()/focus()/fashion() used below come from corrr

# install_keras() downloads and installs TensorFlow — a one-time setup step.
# Guard it so re-running the notebook does not reinstall every time.
if (!is_keras_available()) {
    install_keras()
}

Load the Churn Data

# Load the raw churn data from CSV (readr::read_csv keeps strings as character)
# NOTE(review): hardcoded absolute local path — consider a relative path so the
# notebook runs on other machines
churn_data_raw <- read_csv("/Users/arun.krishnaswamy/Documents/NeuralNets/BingData.csv")
Parsed with column specification:
cols(
  .default = col_character(),
  SeniorCitizen = col_integer(),
  tenure = col_integer(),
  MonthlyCharges = col_double(),
  TotalCharges = col_double()
)
See spec(...) for full column specifications.
glimpse(churn_data_raw)
Observations: 7,043
Variables: 21
$ customerID       <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795-CFOCW", "9237-HQ...
$ gender           <chr> "Female", "Male", "Male", "Male", "Female", "Female", "Male", "F...
$ SeniorCitizen    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0...
$ Partner          <chr> "Yes", "No", "No", "No", "No", "No", "No", "No", "Yes", "No", "Y...
$ Dependents       <chr> "No", "No", "No", "No", "No", "No", "Yes", "No", "No", "Yes", "Y...
$ tenure           <int> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 58, 49, 25, 69, 52, ...
$ PhoneService     <chr> "No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "No", "Yes", "Yes...
$ MultipleLines    <chr> "No phone service", "No", "No", "No phone service", "No", "Yes",...
$ InternetService  <chr> "DSL", "DSL", "DSL", "DSL", "Fiber optic", "Fiber optic", "Fiber...
$ OnlineSecurity   <chr> "No", "Yes", "Yes", "Yes", "No", "No", "No", "Yes", "No", "Yes",...
$ OnlineBackup     <chr> "Yes", "No", "Yes", "No", "No", "No", "Yes", "No", "No", "Yes", ...
$ DeviceProtection <chr> "No", "Yes", "No", "Yes", "No", "Yes", "No", "No", "Yes", "No", ...
$ TechSupport      <chr> "No", "No", "No", "Yes", "No", "No", "No", "No", "Yes", "No", "N...
$ StreamingTV      <chr> "No", "No", "No", "No", "No", "Yes", "Yes", "No", "Yes", "No", "...
$ StreamingMovies  <chr> "No", "No", "No", "No", "No", "Yes", "No", "No", "Yes", "No", "N...
$ Contract         <chr> "Month-to-month", "One year", "Month-to-month", "One year", "Mon...
$ PaperlessBilling <chr> "Yes", "No", "Yes", "No", "Yes", "Yes", "Yes", "No", "Yes", "No"...
$ PaymentMethod    <chr> "Electronic check", "Mailed check", "Mailed check", "Bank transf...
$ MonthlyCharges   <dbl> 29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89.10, 29.75, 104.80, ...
$ TotalCharges     <dbl> 29.85, 1889.50, 108.15, 1840.75, 151.65, 820.50, 1949.40, 301.90...
$ Churn            <chr> "No", "No", "Yes", "No", "Yes", "Yes", "No", "No", "Yes", "No", ...

Remove unnecessary Data

# Drop the customer identifier — an ID has no predictive value
churn_no_id <- select(churn_data_raw, -customerID)

# Discard rows containing missing values (7,043 -> 7,032 rows)
churn_complete <- drop_na(churn_no_id)

# Move the outcome (Churn) to the first column for readability
churn_data_tbl <- select(churn_complete, Churn, everything())

glimpse(churn_data_tbl)
Observations: 7,032
Variables: 20
$ Churn            <chr> "No", "No", "Yes", "No", "Yes", "Yes", "No", "No", "Yes", "No", ...
$ gender           <chr> "Female", "Male", "Male", "Male", "Female", "Female", "Male", "F...
$ SeniorCitizen    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0...
$ Partner          <chr> "Yes", "No", "No", "No", "No", "No", "No", "No", "Yes", "No", "Y...
$ Dependents       <chr> "No", "No", "No", "No", "No", "No", "Yes", "No", "No", "Yes", "Y...
$ tenure           <int> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 58, 49, 25, 69, 52, ...
$ PhoneService     <chr> "No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "No", "Yes", "Yes...
$ MultipleLines    <chr> "No phone service", "No", "No", "No phone service", "No", "Yes",...
$ InternetService  <chr> "DSL", "DSL", "DSL", "DSL", "Fiber optic", "Fiber optic", "Fiber...
$ OnlineSecurity   <chr> "No", "Yes", "Yes", "Yes", "No", "No", "No", "Yes", "No", "Yes",...
$ OnlineBackup     <chr> "Yes", "No", "Yes", "No", "No", "No", "Yes", "No", "No", "Yes", ...
$ DeviceProtection <chr> "No", "Yes", "No", "Yes", "No", "Yes", "No", "No", "Yes", "No", ...
$ TechSupport      <chr> "No", "No", "No", "Yes", "No", "No", "No", "No", "Yes", "No", "N...
$ StreamingTV      <chr> "No", "No", "No", "No", "No", "Yes", "Yes", "No", "Yes", "No", "...
$ StreamingMovies  <chr> "No", "No", "No", "No", "No", "Yes", "No", "No", "Yes", "No", "N...
$ Contract         <chr> "Month-to-month", "One year", "Month-to-month", "One year", "Mon...
$ PaperlessBilling <chr> "Yes", "No", "Yes", "No", "Yes", "Yes", "Yes", "No", "Yes", "No"...
$ PaymentMethod    <chr> "Electronic check", "Mailed check", "Mailed check", "Bank transf...
$ MonthlyCharges   <dbl> 29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89.10, 29.75, 104.80, ...
$ TotalCharges     <dbl> 29.85, 1889.50, 108.15, 1840.75, 151.65, 820.50, 1949.40, 301.90...
# Split test/training sets: 80% train / 20% test via rsample
set.seed(100)  # fixed seed so the split (and results) are reproducible
train_test_split <- initial_split(churn_data_tbl, prop = 0.8)
# Print the split summary (<train/test/total> row counts)
train_test_split
<5626/1406/7032>
# Retrieve train and test sets from the rsample split object
# (5,626 training rows / 1,406 test rows per the split printout above)
train_tbl <- training(train_test_split)
test_tbl  <- testing(train_test_split) 

Artificial Neural Networks are best when the data is one-hot encoded, scaled and centered

We do 4 transformations :

  1. Log transform of TotalCharges - this makes the distribution of TotalCharges less skewed and more centered. [log]
  2. Bin tenure into 6 bins/cohorts/groups - this should help the ML algorithm detect whether a group is more or less susceptible to customer churn. [discretize]
  3. One-hot encode the categorical data - this converts categorical variables to sparse data, with columns of only zeros and ones. [one-hot]
  4. Feature scaling - center and scale all predictors. [center/scale]

We will use recipe package for doing the transformations.

# Determine whether a log transformation improves the correlation
# between TotalCharges and Churn (Churn coerced to numeric 1/2 via factor)
corr_check_tbl <- train_tbl %>%
    select(Churn, TotalCharges) %>%
    mutate(
        Churn           = as.numeric(as.factor(Churn)),
        LogTotalCharges = log(TotalCharges)
    )

# Correlate all columns, keep only correlations against Churn, pretty-print
corr_check_tbl %>%
    correlate() %>%
    focus(Churn) %>%
    fashion()

RECIPE

  1. step_discretize() with the option = list(cuts = 6) to cut the continuous variable for “tenure” (number of years as a customer) to group customers into cohorts.
  2. step_log() to log transform “TotalCharges”.
  3. step_dummy() to one-hot encode the categorical data. Note that this adds columns of one/zero for categorical data with three or more categories.
  4. step_center() to mean-center the data.
  5. step_scale() to scale the data.
# Create recipe: bin tenure into 6 cohorts, log-transform TotalCharges,
# one-hot encode the nominal predictors, then center and scale everything.
# All steps are estimated from the training data only (no test-set leakage).
rec_obj <- recipe(Churn ~ ., data = train_tbl) %>%
    step_discretize(tenure, options = list(cuts = 6)) %>%
    step_log(TotalCharges) %>%
    step_dummy(all_nominal(), -all_outcomes()) %>%
    step_center(all_predictors(), -all_outcomes()) %>%
    step_scale(all_predictors(), -all_outcomes()) %>%
    # Fix: prep()'s argument is `training`, not `data` — `data` was silently
    # swallowed by `...` and prep() fell back to the recipe's stored template.
    prep(training = train_tbl)
# Print the recipe object
rec_obj
Data Recipe

Inputs:

      role #variables
   outcome          1
 predictor         19

Training data contained 5626 data points and no missing data.

Operations:

Dummy variables from tenure [trained]
Log transformation on TotalCharges [trained]
Dummy variables from gender, Partner, Dependents, tenure, PhoneService, ... [trained]
Centering for SeniorCitizen, MonthlyCharges, TotalCharges, ... [trained]
Scaling for SeniorCitizen, MonthlyCharges, TotalCharges, ... [trained]

BAKING WITH YOUR RECIPE

We can apply the “recipe” to any data set with the bake() function, and it processes the data following the recipe steps. We’ll apply to our training and testing data to convert from raw data to a machine learning dataset

# Predictors: apply the trained recipe to both splits, then drop the outcome
# column so only model inputs remain
# NOTE(review): newer recipes versions renamed bake()'s `newdata` argument to
# `new_data` — confirm against the installed recipes version
x_train_tbl <- bake(rec_obj, newdata = train_tbl) %>% select(-Churn)
x_test_tbl <- bake(rec_obj, newdata = test_tbl) %>% select(-Churn)
# Inspect the processed training predictors (35 numeric columns after baking)
glimpse(x_train_tbl)
Observations: 5,626
Variables: 35
$ SeniorCitizen                         <dbl> -0.4351959, -0.4351959, -0.4351959, -0.4351...
$ MonthlyCharges                        <dbl> -1.157597168, -0.260133339, -0.745293674, 0...
$ TotalCharges                          <dbl> -2.275819130, 0.389259098, 0.372464296, -1....
$ gender_Male                           <dbl> -1.0016900, 0.9981354, 0.9981354, -1.001690...
$ Partner_Yes                           <dbl> 1.0262054, -0.9742906, -0.9742906, -0.97429...
$ Dependents_Yes                        <dbl> -0.6507747, -0.6507747, -0.6507747, -0.6507...
$ tenure_bin1                           <dbl> 2.1677790, -0.4612196, -0.4612196, 2.167779...
$ tenure_bin2                           <dbl> -0.4389453, -0.4389453, -0.4389453, -0.4389...
$ tenure_bin3                           <dbl> -0.4481273, -0.4481273, -0.4481273, -0.4481...
$ tenure_bin4                           <dbl> -0.4509837, 2.2169809, 2.2169809, -0.450983...
$ tenure_bin5                           <dbl> -0.4498419, -0.4498419, -0.4498419, -0.4498...
$ tenure_bin6                           <dbl> -0.4337508, -0.4337508, -0.4337508, -0.4337...
$ PhoneService_Yes                      <dbl> -3.0407367, 0.3288092, -3.0407367, 0.328809...
$ MultipleLines_No.phone.service        <dbl> 3.0407367, -0.3288092, 3.0407367, -0.328809...
$ MultipleLines_Yes                     <dbl> -0.8571364, -0.8571364, -0.8571364, -0.8571...
$ InternetService_Fiber.optic           <dbl> -0.8884255, -0.8884255, -0.8884255, 1.12538...
$ InternetService_No                    <dbl> -0.5272627, -0.5272627, -0.5272627, -0.5272...
$ OnlineSecurity_No.internet.service    <dbl> -0.5272627, -0.5272627, -0.5272627, -0.5272...
$ OnlineSecurity_Yes                    <dbl> -0.6369654, 1.5696648, 1.5696648, -0.636965...
$ OnlineBackup_No.internet.service      <dbl> -0.5272627, -0.5272627, -0.5272627, -0.5272...
$ OnlineBackup_Yes                      <dbl> 1.3771987, -0.7259826, -0.7259826, -0.72598...
$ DeviceProtection_No.internet.service  <dbl> -0.5272627, -0.5272627, -0.5272627, -0.5272...
$ DeviceProtection_Yes                  <dbl> -0.7259826, 1.3771987, 1.3771987, -0.725982...
$ TechSupport_No.internet.service       <dbl> -0.5272627, -0.5272627, -0.5272627, -0.5272...
$ TechSupport_Yes                       <dbl> -0.6358628, -0.6358628, 1.5723867, -0.63586...
$ StreamingTV_No.internet.service       <dbl> -0.5272627, -0.5272627, -0.5272627, -0.5272...
$ StreamingTV_Yes                       <dbl> -0.7917326, -0.7917326, -0.7917326, -0.7917...
$ StreamingMovies_No.internet.service   <dbl> -0.5272627, -0.5272627, -0.5272627, -0.5272...
$ StreamingMovies_Yes                   <dbl> -0.797388, -0.797388, -0.797388, -0.797388,...
$ Contract_One.year                     <dbl> -0.5156834, 1.9388298, 1.9388298, -0.515683...
$ Contract_Two.year                     <dbl> -0.5618358, -0.5618358, -0.5618358, -0.5618...
$ PaperlessBilling_Yes                  <dbl> 0.8330334, -1.2002187, -1.2002187, 0.833033...
$ PaymentMethod_Credit.card..automatic. <dbl> -0.5231315, -0.5231315, -0.5231315, -0.5231...
$ PaymentMethod_Electronic.check        <dbl> 1.4154085, -0.7063842, -0.7063842, 1.415408...
$ PaymentMethod_Mailed.check            <dbl> -0.5517013, 1.8122527, -0.5517013, -0.55170...

THE TARGET

One last step, we need to store the actual values (truth) as y_train_vec and y_test_vec, which are needed for modeling our ANN. We convert to a series of numeric ones and zeros which can be accepted by the Keras ANN modeling functions

# Response variables for training and testing sets:
# encode Churn as numeric 1 ("Yes") / 0 ("No"), as required by Keras
y_train_vec <- as.numeric(pull(train_tbl, Churn) == "Yes")
y_test_vec  <- as.numeric(pull(test_tbl, Churn) == "Yes")

We’ll build a three-layer MLP with Keras.

Initialize a sequential model: The first step is to initialize a sequential model with keras_model_sequential(), which is the beginning of our Keras model. The sequential model is composed of a linear stack of layers.

Apply layers to the sequential model: Layers consist of the input layer, hidden layers and an output layer. The input layer is the data and provided it’s formatted correctly there’s nothing more to discuss. The hidden layers and output layers are what controls the ANN inner workings.

Hidden Layers: Hidden layers form the neural network nodes that enable non-linear activation using weights. The hidden layers are created using layer_dense().

We’ll add two hidden layers. We’ll apply units = 16, which is the number of nodes. We’ll select kernel_initializer = “uniform” and activation = “relu” for both layers. The first layer needs to have the input_shape = 35, which is the number of columns in the training set.

Dropout Layers: Dropout layers are used to control overfitting. During training, dropout randomly sets a fraction of the layer's activations to zero, which prevents the network from relying too heavily on any individual node. We use the layer_dropout() function to add two dropout layers with rate = 0.10, dropping 10% of the units on each training update.

Output Layer: The output layer specifies the shape of the output and the method of assimilating the learned information. The output layer is applied using the layer_dense().

For binary values, the shape should be units = 1. For multi-classification, the units should correspond to the number of classes. We set the kernel_initializer = “uniform” and the activation = “sigmoid” (common for binary classification).

Compile the model: The last step is to compile the model with compile().

We’ll use optimizer = “adam”, which is one of the most popular optimization algorithms. We select loss = “binary_crossentropy” since this is a binary classification problem.

We’ll select metrics = c(“accuracy”) to be evaluated during training and testing.

# Building our Artificial Neural Network
# (keras models are modified in place, so the pipeline below mutates
# model_keras directly)
model_keras <- keras_model_sequential()

model_keras %>%
    # Hidden layer 1: 16 relu units; input_shape = number of predictor columns
    layer_dense(units = 16, kernel_initializer = "uniform",
                activation = "relu", input_shape = ncol(x_train_tbl)) %>%
    # Dropout (rate = 0.1) to reduce overfitting
    layer_dropout(rate = 0.1) %>%
    # Hidden layer 2: 16 relu units
    layer_dense(units = 16, kernel_initializer = "uniform",
                activation = "relu") %>%
    # Dropout (rate = 0.1) to reduce overfitting
    layer_dropout(rate = 0.1) %>%
    # Output layer: single sigmoid unit -> churn probability
    layer_dense(units = 1, kernel_initializer = "uniform",
                activation = "sigmoid") %>%
    # Compile for binary classification
    compile(optimizer = 'adam',
            loss      = 'binary_crossentropy',
            metrics   = c('accuracy'))

model_keras
Model
______________________________________________________________________________________________
Layer (type)                              Output Shape                         Param #        
==============================================================================================
dense_4 (Dense)                           (None, 16)                           576            
______________________________________________________________________________________________
dropout_3 (Dropout)                       (None, 16)                           0              
______________________________________________________________________________________________
dense_5 (Dense)                           (None, 16)                           272            
______________________________________________________________________________________________
dropout_4 (Dropout)                       (None, 16)                           0              
______________________________________________________________________________________________
dense_6 (Dense)                           (None, 1)                            17             
==============================================================================================
Total params: 865
Trainable params: 865
Non-trainable params: 0
______________________________________________________________________________________________

Fitting the Model

We use the fit() function to run the ANN on our training data. The object is our model, and x and y are our training data in matrix and numeric vector forms, respectively. batch_size = 50 sets the number of samples per gradient update within each epoch. epochs = 35 controls the number of training cycles. validation_split = 0.30 holds out 30% of the training data for model validation, which helps us monitor for overfitting.

# Fit the keras model to the training data: 50 samples per gradient update,
# 35 passes over the data, with 30% of the training rows held out for
# validation at each epoch
fit_keras <- model_keras %>%
    fit(
        x                = as.matrix(x_train_tbl),
        y                = y_train_vec,
        batch_size       = 50,
        epochs           = 35,
        validation_split = 0.30
    )
Train on 3938 samples, validate on 1688 samples
Epoch 1/35

  50/3938 [..............................] - ETA: 25s - loss: 0.6933 - acc: 0.4400
1000/3938 [======>.......................] - ETA: 1s - loss: 0.6894 - acc: 0.7000 
1750/3938 [============>.................] - ETA: 0s - loss: 0.6830 - acc: 0.7131
2750/3938 [===================>..........] - ETA: 0s - loss: 0.6649 - acc: 0.7185
3150/3938 [======================>.......] - ETA: 0s - loss: 0.6548 - acc: 0.7184
3938/3938 [==============================] - 1s 174us/step - loss: 0.6320 - acc: 0.7245 - val_loss: 0.5234 - val_acc: 0.7322
Epoch 2/35

  50/3938 [..............................] - ETA: 0s - loss: 0.5095 - acc: 0.7400
1050/3938 [======>.......................] - ETA: 0s - loss: 0.4824 - acc: 0.7390
1900/3938 [=============>................] - ETA: 0s - loss: 0.4676 - acc: 0.7400
3150/3938 [======================>.......] - ETA: 0s - loss: 0.4692 - acc: 0.7352
3400/3938 [========================>.....] - ETA: 0s - loss: 0.4654 - acc: 0.7359
3938/3938 [==============================] - 0s 64us/step - loss: 0.4681 - acc: 0.7308 - val_loss: 0.4585 - val_acc: 0.7322
Epoch 3/35

  50/3938 [..............................] - ETA: 0s - loss: 0.3969 - acc: 0.8400
 750/3938 [====>.........................] - ETA: 0s - loss: 0.4365 - acc: 0.7267
2000/3938 [==============>...............] - ETA: 0s - loss: 0.4401 - acc: 0.7370
3200/3938 [=======================>......] - ETA: 0s - loss: 0.4346 - acc: 0.7372
3938/3938 [==============================] - 0s 54us/step - loss: 0.4446 - acc: 0.7308 - val_loss: 0.4511 - val_acc: 0.7322
Epoch 4/35

  50/3938 [..............................] - ETA: 0s - loss: 0.4352 - acc: 0.7400
1450/3938 [==========>...................] - ETA: 0s - loss: 0.4484 - acc: 0.7255
2400/3938 [=================>............] - ETA: 0s - loss: 0.4453 - acc: 0.7333
3900/3938 [============================>.] - ETA: 0s - loss: 0.4413 - acc: 0.7300
3938/3938 [==============================] - 0s 45us/step - loss: 0.4399 - acc: 0.7308 - val_loss: 0.4474 - val_acc: 0.7322
Epoch 5/35

  50/3938 [..............................] - ETA: 0s - loss: 0.4038 - acc: 0.6800
 950/3938 [======>.......................] - ETA: 0s - loss: 0.4447 - acc: 0.7242
2450/3938 [=================>............] - ETA: 0s - loss: 0.4368 - acc: 0.7294
3850/3938 [============================>.] - ETA: 0s - loss: 0.4342 - acc: 0.7314
3938/3938 [==============================] - 0s 57us/step - loss: 0.4341 - acc: 0.7308 - val_loss: 0.4453 - val_acc: 0.7322
Epoch 6/35

  50/3938 [..............................] - ETA: 0s - loss: 0.3938 - acc: 0.7600
1400/3938 [=========>....................] - ETA: 0s - loss: 0.4228 - acc: 0.7971
2250/3938 [================>.............] - ETA: 0s - loss: 0.4180 - acc: 0.7951
3450/3938 [=========================>....] - ETA: 0s - loss: 0.4271 - acc: 0.7957
3938/3938 [==============================] - 0s 57us/step - loss: 0.4329 - acc: 0.7925 - val_loss: 0.4440 - val_acc: 0.8021
Epoch 7/35

  50/3938 [..............................] - ETA: 0s - loss: 0.3628 - acc: 0.8200
1500/3938 [==========>...................] - ETA: 0s - loss: 0.4398 - acc: 0.7933
3000/3938 [=====================>........] - ETA: 0s - loss: 0.4344 - acc: 0.7950
3938/3938 [==============================] - 0s 43us/step - loss: 0.4308 - acc: 0.7969 - val_loss: 0.4441 - val_acc: 0.8004
Epoch 8/35

  50/3938 [..............................] - ETA: 0s - loss: 0.3850 - acc: 0.8200
1150/3938 [=======>......................] - ETA: 0s - loss: 0.4124 - acc: 0.8017
2250/3938 [================>.............] - ETA: 0s - loss: 0.4349 - acc: 0.7960
3800/3938 [===========================>..] - ETA: 0s - loss: 0.4290 - acc: 0.8005
3938/3938 [==============================] - 0s 49us/step - loss: 0.4282 - acc: 0.8017 - val_loss: 0.4427 - val_acc: 0.8015
Epoch 9/35

  50/3938 [..............................] - ETA: 0s - loss: 0.3321 - acc: 0.8600
1300/3938 [========>.....................] - ETA: 0s - loss: 0.4085 - acc: 0.8146
2650/3938 [===================>..........] - ETA: 0s - loss: 0.4156 - acc: 0.8060
3938/3938 [==============================] - 0s 45us/step - loss: 0.4261 - acc: 0.8040 - val_loss: 0.4405 - val_acc: 0.8021
Epoch 10/35

  50/3938 [..............................] - ETA: 0s - loss: 0.3550 - acc: 0.8600
1300/3938 [========>.....................] - ETA: 0s - loss: 0.4246 - acc: 0.8031
2450/3938 [=================>............] - ETA: 0s - loss: 0.4210 - acc: 0.8127
3350/3938 [========================>.....] - ETA: 0s - loss: 0.4229 - acc: 0.8078
3938/3938 [==============================] - 0s 49us/step - loss: 0.4251 - acc: 0.8075 - val_loss: 0.4404 - val_acc: 0.8021
Epoch 11/35

  50/3938 [..............................] - ETA: 0s - loss: 0.4570 - acc: 0.7200
1450/3938 [==========>...................] - ETA: 0s - loss: 0.4326 - acc: 0.7952
2950/3938 [=====================>........] - ETA: 0s - loss: 0.4198 - acc: 0.8115
3938/3938 [==============================] - 0s 40us/step - loss: 0.4239 - acc: 0.8075 - val_loss: 0.4399 - val_acc: 0.7986
Epoch 12/35

  50/3938 [..............................] - ETA: 0s - loss: 0.3855 - acc: 0.7200
1300/3938 [========>.....................] - ETA: 0s - loss: 0.4251 - acc: 0.7962
2950/3938 [=====================>........] - ETA: 0s - loss: 0.4282 - acc: 0.8051
3938/3938 [==============================] - 0s 43us/step - loss: 0.4252 - acc: 0.8052 - val_loss: 0.4381 - val_acc: 0.7986
Epoch 13/35

  50/3938 [..............................] - ETA: 0s - loss: 0.3745 - acc: 0.7800
1600/3938 [===========>..................] - ETA: 0s - loss: 0.4313 - acc: 0.8069
3050/3938 [======================>.......] - ETA: 0s - loss: 0.4217 - acc: 0.8079
3938/3938 [==============================] - 0s 40us/step - loss: 0.4222 - acc: 0.8085 - val_loss: 0.4402 - val_acc: 0.8045
Epoch 14/35

  50/3938 [..............................] - ETA: 0s - loss: 0.3136 - acc: 0.8800
1400/3938 [=========>....................] - ETA: 0s - loss: 0.4354 - acc: 0.7986
2400/3938 [=================>............] - ETA: 0s - loss: 0.4305 - acc: 0.8025
3650/3938 [==========================>...] - ETA: 0s - loss: 0.4192 - acc: 0.8074
3938/3938 [==============================] - 0s 47us/step - loss: 0.4191 - acc: 0.8062 - val_loss: 0.4387 - val_acc: 0.7974
Epoch 15/35

  50/3938 [..............................] - ETA: 0s - loss: 0.3775 - acc: 0.8400
1300/3938 [========>.....................] - ETA: 0s - loss: 0.4209 - acc: 0.7946
2850/3938 [====================>.........] - ETA: 0s - loss: 0.4193 - acc: 0.8060
3938/3938 [==============================] - 0s 41us/step - loss: 0.4192 - acc: 0.8029 - val_loss: 0.4381 - val_acc: 0.7968
Epoch 16/35

  50/3938 [..............................] - ETA: 0s - loss: 0.5004 - acc: 0.8600
1300/3938 [========>.....................] - ETA: 0s - loss: 0.4209 - acc: 0.8215
2950/3938 [=====================>........] - ETA: 0s - loss: 0.4199 - acc: 0.8078
3938/3938 [==============================] - 0s 42us/step - loss: 0.4206 - acc: 0.8098 - val_loss: 0.4368 - val_acc: 0.7986
Epoch 17/35

  50/3938 [..............................] - ETA: 0s - loss: 0.4374 - acc: 0.8000
1450/3938 [==========>...................] - ETA: 0s - loss: 0.4328 - acc: 0.8028
2600/3938 [==================>...........] - ETA: 0s - loss: 0.4176 - acc: 0.8088
3800/3938 [===========================>..] - ETA: 0s - loss: 0.4210 - acc: 0.8089
3938/3938 [==============================] - 0s 46us/step - loss: 0.4207 - acc: 0.8080 - val_loss: 0.4375 - val_acc: 0.8027
Epoch 18/35

  50/3938 [..............................] - ETA: 0s - loss: 0.3658 - acc: 0.8400
1500/3938 [==========>...................] - ETA: 0s - loss: 0.4123 - acc: 0.8100
2750/3938 [===================>..........] - ETA: 0s - loss: 0.4077 - acc: 0.8095
3938/3938 [==============================] - 0s 40us/step - loss: 0.4141 - acc: 0.8088 - val_loss: 0.4363 - val_acc: 0.8009
Epoch 19/35

  50/3938 [..............................] - ETA: 0s - loss: 0.3844 - acc: 0.8200
1300/3938 [========>.....................] - ETA: 0s - loss: 0.4269 - acc: 0.8077
2800/3938 [====================>.........] - ETA: 0s - loss: 0.4302 - acc: 0.8029
3938/3938 [==============================] - 0s 42us/step - loss: 0.4168 - acc: 0.8083 - val_loss: 0.4363 - val_acc: 0.7992
Epoch 20/35

  50/3938 [..............................] - ETA: 0s - loss: 0.3480 - acc: 0.8400
1000/3938 [======>.......................] - ETA: 0s - loss: 0.4198 - acc: 0.8040
2650/3938 [===================>..........] - ETA: 0s - loss: 0.4155 - acc: 0.8087
3900/3938 [============================>.] - ETA: 0s - loss: 0.4149 - acc: 0.8090
3938/3938 [==============================] - 0s 45us/step - loss: 0.4153 - acc: 0.8085 - val_loss: 0.4357 - val_acc: 0.8004
Epoch 21/35

  50/3938 [..............................] - ETA: 0s - loss: 0.4538 - acc: 0.7400
1450/3938 [==========>...................] - ETA: 0s - loss: 0.4303 - acc: 0.8062
2500/3938 [==================>...........] - ETA: 0s - loss: 0.4117 - acc: 0.8108
3600/3938 [==========================>...] - ETA: 0s - loss: 0.4113 - acc: 0.8125
3938/3938 [==============================] - 0s 47us/step - loss: 0.4151 - acc: 0.8118 - val_loss: 0.4362 - val_acc: 0.7992
Epoch 22/35

  50/3938 [..............................] - ETA: 0s - loss: 0.4497 - acc: 0.8200
1450/3938 [==========>...................] - ETA: 0s - loss: 0.4246 - acc: 0.8028
2950/3938 [=====================>........] - ETA: 0s - loss: 0.4193 - acc: 0.8003
3938/3938 [==============================] - 0s 40us/step - loss: 0.4124 - acc: 0.8095 - val_loss: 0.4355 - val_acc: 0.7956
Epoch 23/35

  50/3938 [..............................] - ETA: 0s - loss: 0.4950 - acc: 0.7200
1250/3938 [========>.....................] - ETA: 0s - loss: 0.4013 - acc: 0.8184
2850/3938 [====================>.........] - ETA: 0s - loss: 0.4093 - acc: 0.8067
3938/3938 [==============================] - 0s 43us/step - loss: 0.4134 - acc: 0.8083 - val_loss: 0.4354 - val_acc: 0.7944
Epoch 24/35

  50/3938 [..............................] - ETA: 0s - loss: 0.5085 - acc: 0.7600
1200/3938 [========>.....................] - ETA: 0s - loss: 0.4251 - acc: 0.7883
2450/3938 [=================>............] - ETA: 0s - loss: 0.4151 - acc: 0.8020
3250/3938 [=======================>......] - ETA: 0s - loss: 0.4086 - acc: 0.8102
3938/3938 [==============================] - 0s 51us/step - loss: 0.4145 - acc: 0.8065 - val_loss: 0.4355 - val_acc: 0.7956
Epoch 25/35

  50/3938 [..............................] - ETA: 0s - loss: 0.3083 - acc: 0.8400
1500/3938 [==========>...................] - ETA: 0s - loss: 0.4048 - acc: 0.8100
2800/3938 [====================>.........] - ETA: 0s - loss: 0.4055 - acc: 0.8093
3938/3938 [==============================] - 0s 40us/step - loss: 0.4105 - acc: 0.8065 - val_loss: 0.4333 - val_acc: 0.7986
Epoch 26/35

  50/3938 [..............................] - ETA: 0s - loss: 0.3913 - acc: 0.8200
1250/3938 [========>.....................] - ETA: 0s - loss: 0.4038 - acc: 0.8136
2850/3938 [====================>.........] - ETA: 0s - loss: 0.4043 - acc: 0.8088
3938/3938 [==============================] - 0s 42us/step - loss: 0.4133 - acc: 0.8065 - val_loss: 0.4325 - val_acc: 0.7980
Epoch 27/35

  50/3938 [..............................] - ETA: 0s - loss: 0.3662 - acc: 0.8400
1300/3938 [========>.....................] - ETA: 0s - loss: 0.4120 - acc: 0.8131
3000/3938 [=====================>........] - ETA: 0s - loss: 0.4139 - acc: 0.8040
3938/3938 [==============================] - 0s 42us/step - loss: 0.4100 - acc: 0.8065 - val_loss: 0.4321 - val_acc: 0.7986
Epoch 28/35

  50/3938 [..............................] - ETA: 0s - loss: 0.4113 - acc: 0.8400
1450/3938 [==========>...................] - ETA: 0s - loss: 0.4065 - acc: 0.8083
2500/3938 [==================>...........] - ETA: 0s - loss: 0.4138 - acc: 0.8028
3600/3938 [==========================>...] - ETA: 0s - loss: 0.4078 - acc: 0.8108
3938/3938 [==============================] - 0s 48us/step - loss: 0.4086 - acc: 0.8116 - val_loss: 0.4344 - val_acc: 0.8004
Epoch 29/35

  50/3938 [..............................] - ETA: 0s - loss: 0.4814 - acc: 0.7400
1450/3938 [==========>...................] - ETA: 0s - loss: 0.4119 - acc: 0.8076
2900/3938 [=====================>........] - ETA: 0s - loss: 0.4029 - acc: 0.8066
3938/3938 [==============================] - 0s 40us/step - loss: 0.4084 - acc: 0.8065 - val_loss: 0.4341 - val_acc: 0.7956
Epoch 30/35

  50/3938 [..............................] - ETA: 0s - loss: 0.4270 - acc: 0.8400
1150/3938 [=======>......................] - ETA: 0s - loss: 0.4215 - acc: 0.8122
2800/3938 [====================>.........] - ETA: 0s - loss: 0.4112 - acc: 0.8118
3938/3938 [==============================] - 0s 44us/step - loss: 0.4110 - acc: 0.8068 - val_loss: 0.4311 - val_acc: 0.8009
Epoch 31/35

  50/3938 [..............................] - ETA: 0s - loss: 0.4665 - acc: 0.7600
1600/3938 [===========>..................] - ETA: 0s - loss: 0.4220 - acc: 0.8006
3200/3938 [=======================>......] - ETA: 0s - loss: 0.4132 - acc: 0.8091
3938/3938 [==============================] - 0s 41us/step - loss: 0.4087 - acc: 0.8085 - val_loss: 0.4374 - val_acc: 0.8027
Epoch 32/35

  50/3938 [..............................] - ETA: 0s - loss: 0.5455 - acc: 0.8000
1400/3938 [=========>....................] - ETA: 0s - loss: 0.4137 - acc: 0.8207
2300/3938 [================>.............] - ETA: 0s - loss: 0.4116 - acc: 0.8183
3600/3938 [==========================>...] - ETA: 0s - loss: 0.4042 - acc: 0.8128
3938/3938 [==============================] - 0s 47us/step - loss: 0.4072 - acc: 0.8098 - val_loss: 0.4324 - val_acc: 0.7986
Epoch 33/35

  50/3938 [..............................] - ETA: 0s - loss: 0.2979 - acc: 0.8800
1250/3938 [========>.....................] - ETA: 0s - loss: 0.3937 - acc: 0.8032
2700/3938 [===================>..........] - ETA: 0s - loss: 0.3999 - acc: 0.8081
3938/3938 [==============================] - 0s 42us/step - loss: 0.4064 - acc: 0.8050 - val_loss: 0.4318 - val_acc: 0.8015
Epoch 34/35

  50/3938 [..............................] - ETA: 0s - loss: 0.3987 - acc: 0.8200
1250/3938 [========>.....................] - ETA: 0s - loss: 0.4076 - acc: 0.8040
2900/3938 [=====================>........] - ETA: 0s - loss: 0.4077 - acc: 0.8093
3938/3938 [==============================] - 0s 42us/step - loss: 0.4020 - acc: 0.8106 - val_loss: 0.4324 - val_acc: 0.8009
Epoch 35/35

  50/3938 [..............................] - ETA: 0s - loss: 0.4352 - acc: 0.7800
1300/3938 [========>.....................] - ETA: 0s - loss: 0.4188 - acc: 0.8038
2750/3938 [===================>..........] - ETA: 0s - loss: 0.4109 - acc: 0.8033
3938/3938 [==============================] - 0s 46us/step - loss: 0.4024 - acc: 0.8093 - val_loss: 0.4327 - val_acc: 0.7986
# Print the final model training history
fit_keras
# Plot the training/validation history of our Keras model;
# plot() returns a ggplot, so tidyquant theme/scales can be layered on
plot(fit_keras) +
    theme_tq() +
    scale_color_tq() +
    scale_fill_tq() +
    labs(title = "Neural Net Training Results")

We’ve got a good model based on the validation accuracy. Now let’s make some predictions from our keras model on the test data set, which was unseen during modeling (we use this for the true performance assessment). We have two functions to generate predictions:

predict_classes: Generates class values as a matrix of ones and zeros. Since we are dealing with binary classification, we’ll convert the output to a vector. predict_proba: Generates the class probabilities as a numeric matrix indicating the probability of being a class. Again, we convert to a numeric vector because there is only one column output.

# Predicted Class
yhat_keras_class_vec <- predict_classes(object = model_keras, x = as.matrix(x_test_tbl)) %>%
    as.vector()
# Predicted Class Probability
yhat_keras_prob_vec  <- predict_proba(object = model_keras, x = as.matrix(x_test_tbl)) %>%
    as.vector()

Yardstick Package

# Format test data and predictions for yardstick metrics
estimates_keras_tbl <- tibble(
    truth      = as.factor(y_test_vec) %>% fct_recode(yes = "1", no = "0"),
    estimate   = as.factor(yhat_keras_class_vec) %>% fct_recode(yes = "1", no = "0"),
    class_prob = yhat_keras_prob_vec
)
estimates_keras_tbl
options(yardstick.event_first = FALSE)
# Confusion Table
estimates_keras_tbl %>% conf_mat(truth, estimate)
          Truth
Prediction  no yes
       no  940 155
       yes 109 202
# Accuracy
estimates_keras_tbl %>% metrics(truth, estimate)
# AUC
estimates_keras_tbl %>% roc_auc(truth, class_prob)
[1] 0.8502816
# Precision and Recall
tibble(
    precision = estimates_keras_tbl %>% precision(truth, estimate),
    recall    = estimates_keras_tbl %>% recall(truth, estimate)
)

Visualization of the MODEL WITH LIME

LIME stands for Local Interpretable Model-agnostic Explanations, and is a method for explaining black-box machine learning model classifiers.

class(model_keras)
[1] "keras.models.Sequential"         "keras.engine.training.Model"    
[3] "keras.engine.topology.Container" "keras.engine.topology.Layer"    
[5] "python.builtin.object"          
# Setup lime::model_type() function for keras
model_type.keras.models.Sequential <- function(x, ...) {
    return("classification")
}
# Setup lime::predict_model() function for keras
predict_model.keras.models.Sequential <- function(x, newdata, type, ...) {
    pred <- predict_proba(object = x, x = as.matrix(newdata))
    return(data.frame(Yes = pred, No = 1 - pred))
}
# Test our predict_model() function
predict_model(x = model_keras, newdata = x_test_tbl, type = 'raw') %>%
    tibble::as_tibble()
# Run lime() on training set
explainer <- lime::lime(
    x              = x_train_tbl, 
    model          = model_keras, 
    bin_continuous = FALSE)
# Run explain() on explainer
explanation <- lime::explain(
    x_test_tbl[1:10,], 
    explainer    = explainer, 
    n_labels     = 1, 
    n_features   = 4,
    kernel_width = 0.5)

FEATURE IMPORTANCE VISUALIZATION

The payoff for the work we put in using LIME is this feature importance plot. This allows us to visualize each of the first ten cases (observations) from the test data. The top four features for each case are shown. Note that they are not the same for each case. The green bars mean that the feature supports the model conclusion, and the red bars contradict. A few important features based on frequency in first ten cases:

Tenure (7 cases)

Senior Citizen (5 cases)

Online Security (4 cases)

plot_features(explanation) +
    labs(title = "LIME Feature Importance Visualization",
         subtitle = "Hold Out (Test) Set, First 10 Cases Shown")

plot_explanations(explanation) +
    labs(title = "LIME Feature Importance Heatmap",
         subtitle = "Hold Out (Test) Set, First 10 Cases Shown")

Feature Investigation - Visualization with LIME

We can investigate features that are most frequent in the LIME feature importance visualization :

Tenure (7/10 LIME Cases, Highly Correlated)

Contract (Highly Correlated)

Internet Service (Highly Correlated)

Payment Method (Highly Correlated)

Senior Citizen (5/10 LIME Cases)

Online Security (4/10 LIME Cases)

# Tenure
churn_data_raw %>%
ggplot(aes(x = Churn, y = tenure)) +
geom_jitter(alpha = 0.25, color = palette_light()[[6]]) +
geom_violin(alpha = 0.6, fill = palette_light()[[1]]) +
theme_tq() +
labs(
title = "Tenure",
subtitle = "Customers with lower tenure are more likely to leave"
)

# Contract
churn_data_raw %>%
mutate(Churn = ifelse(Churn == "Yes", 1, 0)) %>%
ggplot(aes(x = as.factor(Contract), y = Churn)) +
geom_jitter(alpha = 0.25, color = palette_light()[[6]]) +
geom_violin(alpha = 0.6, fill = palette_light()[[1]]) +
theme_tq() +
labs(
title = "Contract Type",
subtitle = "Two and one year contracts much less likely to leave",
x = "Contract"
)

# Internet Service
churn_data_raw %>%
mutate(Churn = ifelse(Churn == "Yes", 1, 0)) %>%
ggplot(aes(x = as.factor(InternetService), y = Churn)) +
geom_jitter(alpha = 0.25, color = palette_light()[[6]]) +
geom_violin(alpha = 0.6, fill = palette_light()[[1]]) +
theme_tq() +
labs(
title = "Internet Service",
subtitle = "Fiber optic more likely to leave",
x = "Internet Service"
)

# Payment Method
churn_data_raw %>%
mutate(Churn = ifelse(Churn == "Yes", 1, 0)) %>%
ggplot(aes(x = as.factor(PaymentMethod), y = Churn)) +
geom_jitter(alpha = 0.25, color = palette_light()[[6]]) +
geom_violin(alpha = 0.6, fill = palette_light()[[1]]) +
theme_tq() +
labs(
title = "Payment Method",
subtitle = "Electronic check more likely to leave",
x = "Payment Method"
)

# Senior Citizen
churn_data_raw %>%
mutate(Churn = ifelse(Churn == "Yes", 1, 0)) %>%
ggplot(aes(x = as.factor(SeniorCitizen), y = Churn)) +
geom_jitter(alpha = 0.25, color = palette_light()[[6]]) +
geom_violin(alpha = 0.6, fill = palette_light()[[1]]) +
theme_tq() +
labs(
title = "Senior Citizen",
subtitle = "Non-senior citizens less likely to leave",
x = "Senior Citizen (Yes = 1)"
)

# Online Security
churn_data_raw %>%
mutate(Churn = ifelse(Churn == "Yes", 1, 0)) %>%
ggplot(aes(x = OnlineSecurity, y = Churn)) +
geom_jitter(alpha = 0.25, color = palette_light()[[6]]) +
geom_violin(alpha = 0.6, fill = palette_light()[[1]]) +
theme_tq() +
labs(
title = "Online Security",
subtitle = "Customers without online security are more likely to leave"
)

---
title: "Bing Churn Data"
output: html_notebook
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 

Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. 

```{r}
library(keras)
library(lime)
library(tidyquant)
library(rsample)
library(recipes)
library(yardstick)
# One-time setup: installs the TensorFlow/Keras Python backend.
# NOTE(review): this re-runs the installation on every notebook
# execution — comment it out (or guard it) after the first
# successful install.
install_keras()
```

# Load the Churn Data 

```{r}
# Load the raw churn data from CSV.
# NOTE(review): absolute, machine-specific path — consider a relative
# path (or here::here()) so the notebook is portable.
churn_data_raw <- read_csv("/Users/arun.krishnaswamy/Documents/NeuralNets/BingData.csv")
glimpse(churn_data_raw)
```

# Remove unnecessary Data 

```{r}
# Clean the raw data: drop the ID column (no predictive value),
# remove rows with missing values, and move the target (Churn)
# to the first column.
churn_data_tbl <- churn_data_raw %>%
    select(-customerID) %>%
    drop_na() %>%
    select(Churn, everything())
    
glimpse(churn_data_tbl)
```

```{r}
# Split into test/training sets (80% train); seed fixed for
# reproducibility of the split
set.seed(100)
train_test_split <- initial_split(churn_data_tbl, prop = 0.8)
train_test_split
```

```{r}
# Retrieve the train and test data frames from the rsample split object
train_tbl <- training(train_test_split)
test_tbl  <- testing(train_test_split) 
```

#Artificial Neural Networks are best when the data is one-hot encoded, scaled and centered

We do 4 transformations :

1. *log transform of TotalCharges*   - This will make the distribution of TotalCharges less skewed and more centered  [ Scale ]
2. *Bin tenure into 6 bins/cohorts/groups* - this should help the ML algorithm detect if a group is more/less susceptible to customer churn.[ Center]
3. *One-hot encoding* is the process of converting categorical data to sparse data, which has columns of only zeros and ones [one-hot]
4. *Feature Scaling* 

We will use recipe package for doing the transformations.

```{r}
# Determine if log transformation improves correlation 
# between TotalCharges and Churn (compare Churn~TotalCharges vs
# Churn~LogTotalCharges correlations).
# NOTE(review): correlate()/focus()/fashion() come from the corrr
# package, which is not loaded in the setup chunk — confirm
# library(corrr) is attached before running this.
train_tbl %>%
    select(Churn, TotalCharges) %>%
    mutate(
        Churn = Churn %>% as.factor() %>% as.numeric(),
        LogTotalCharges = log(TotalCharges)
        ) %>%
    correlate() %>%
    focus(Churn) %>%
    fashion()
```

#RECIPE

1. *step_discretize()* with the option = list(cuts = 6) to cut the continuous variable for “tenure” (number of years as a customer) to group customers into cohorts.
2. *step_log()* to log transform “TotalCharges”.
3. *step_dummy()* to one-hot encode the categorical data. Note that this adds columns of one/zero for categorical data with three or more categories.
4. *step_center()* to mean-center the data.
5. *step_scale()* to scale the data.

```{r}
# Create the preprocessing recipe:
#   1. step_discretize(): bin tenure into 6 cohorts
#   2. step_log():        log-transform the skewed TotalCharges
#   3. step_dummy():      one-hot encode all categorical predictors
#   4. step_center() / 5. step_scale(): center and scale predictors
rec_obj <- recipe(Churn ~ ., data = train_tbl) %>%
    step_discretize(tenure, options = list(cuts = 6)) %>%
    step_log(TotalCharges) %>%
    step_dummy(all_nominal(), -all_outcomes()) %>%
    step_center(all_predictors(), -all_outcomes()) %>%
    step_scale(all_predictors(), -all_outcomes()) %>%
    # prep()'s argument is `training`, not `data`; the original
    # `data = train_tbl` was silently swallowed by `...` and prep
    # fell back to the recipe's template data.
    prep(training = train_tbl)
```

```{r}
# Print the recipe object (lists the trained preprocessing steps)
rec_obj
```

#BAKING WITH YOUR RECIPE

We can apply the “recipe” to any data set with the bake() function, and it processes the data following the recipe steps. 
We’ll apply to our training and testing data to convert from raw data to a machine learning dataset

```{r}
# Predictors: apply the prepped recipe to both data sets and drop
# the outcome column, leaving the ML-ready feature matrices.
# Note: bake()'s data argument is `new_data` (recipes >= 0.1.4);
# the old name `newdata` is defunct in current releases.
x_train_tbl <- bake(rec_obj, new_data = train_tbl) %>% select(-Churn)
x_test_tbl  <- bake(rec_obj, new_data = test_tbl) %>% select(-Churn)

glimpse(x_train_tbl)
```


#THE TARGET

One last step, we need to store the actual values (truth) as y_train_vec and y_test_vec, which are needed for modeling our ANN. 
We convert to a series of numeric ones and zeros which can be accepted by the Keras ANN modeling functions

```{r}
# Response variables: encode Churn as numeric 1 ("Yes") / 0 ("No"),
# the format expected by the Keras fitting and evaluation functions.
y_train_vec <- as.numeric(pull(train_tbl, Churn) == "Yes")
y_test_vec  <- as.numeric(pull(test_tbl, Churn) == "Yes")
```

#We’ll build a three layer MLP with Keras#. 

*Initialize a sequential model:* The first step is to initialize a sequential model with keras_model_sequential(), which is the beginning of our Keras model. 
The sequential model is composed of a linear stack of layers.

*Apply layers to the sequential model:* 
Layers consist of the input layer, hidden layers and an output layer. 
The input layer is the data and provided it’s formatted correctly there’s nothing more to discuss. 
The hidden layers and output layers are what controls the ANN inner workings.

*Hidden Layers:* 
Hidden layers form the neural network nodes that enable non-linear activation using weights. 
The hidden layers are created using **layer_dense()**. 

We’ll add two hidden layers. We’ll apply units = 16, which is the number of nodes. 
We’ll select kernel_initializer = "uniform" and activation = "relu" for both layers. 
The first layer needs to have the input_shape = 35, which is the number of columns in the training set. 

*Dropout Layers:* 
Dropout layers are used to control overfitting. 
Dropout randomly sets a fraction of the layer's activations to zero during each training update, which prevents units from co-adapting and reduces overfitting. 
We use the layer_dropout() function to add two dropout layers with rate = 0.10, so 10% of the units are dropped during each update.

*Output Layer:* 
The output layer specifies the shape of the output and the method of assimilating the learned information. 
The output layer is applied using the layer_dense(). 

For binary values, the shape should be units = 1. 
For multi-classification, the units should correspond to the number of classes. 
We set the kernel_initializer = "uniform" and the activation = "sigmoid" (common for binary classification).

*Compile the model:*
The last step is to compile the model with **compile()**.

We’ll use optimizer = **"adam"**, which is one of the most popular optimization algorithms. 
We select loss = **"binary_crossentropy"** since this is a binary classification problem. 

We’ll select **metrics = c("accuracy")** to be evaluated during training and testing. 


```{r}
# Building our Artificial Neural Network:
# two hidden ReLU layers (16 units each), each followed by 10%
# dropout, and a single sigmoid output unit for binary churn
# classification.
model_keras <- keras_model_sequential()

model_keras %>% 
    # First hidden layer; input_shape is the number of predictor
    # columns produced by the recipe (after one-hot encoding)
    layer_dense(
        units              = 16, 
        kernel_initializer = "uniform", 
        activation         = "relu", 
        input_shape        = ncol(x_train_tbl)) %>% 
    # Dropout to prevent overfitting (drops 10% of units per update)
    layer_dropout(rate = 0.1) %>%
    # Second hidden layer
    layer_dense(
        units              = 16, 
        kernel_initializer = "uniform", 
        activation         = "relu") %>% 
    # Dropout to prevent overfitting
    layer_dropout(rate = 0.1) %>%
    # Output layer: one sigmoid unit -> P(Churn = "Yes")
    layer_dense(
        units              = 1, 
        kernel_initializer = "uniform", 
        activation         = "sigmoid") %>% 
    # Compile ANN: adam optimizer, binary cross-entropy loss,
    # accuracy tracked during training/validation
    compile(
        optimizer = 'adam',
        loss      = 'binary_crossentropy',
        metrics   = c('accuracy')
    )
model_keras
```

## Fitting the Model 

We use the fit() function to run the ANN on our training data. 
The object is our model, and x and y are our training data in matrix and numeric vector forms, respectively. 
**batch_size = 50** sets the number of samples per gradient update within each epoch. 
**epochs = 35** to control the number of training cycles. 
**validation_split = 0.30** to include 30% of the data for model validation, which prevents overfitting. 

```{r}
# Fit the keras model to the training data:
# 35 epochs, mini-batches of 50 samples, and 30% of the training
# data held out for validation to monitor overfitting.
fit_keras <- fit(
    object           = model_keras, 
    x                = as.matrix(x_train_tbl), 
    y                = y_train_vec,
    batch_size       = 50, 
    epochs           = 35,
    validation_split = 0.30
    )
```
```{r}
# Print the final model's training-history summary
fit_keras
```

```{r}
# Plot the training/validation loss and accuracy per epoch,
# styled with tidyquant themes
plot(fit_keras) +
    theme_tq() +
    scale_color_tq() +
    scale_fill_tq() +
    labs(title = "Neural Net Training Results")
```

We’ve got a good model based on the validation accuracy. Now let’s make some predictions from our keras model on the test data set, which was unseen during modeling (we use this for the true performance assessment). 
We have two functions to generate predictions:

**predict_classes:** Generates class values as a matrix of ones and zeros. Since we are dealing with binary classification, we’ll convert the output to a vector.
**predict_proba:** Generates the class probabilities as a numeric matrix indicating the probability of being a class. Again, we convert to a numeric vector because there is only one column output.

```{r}
# Predicted class probability: predict() on a single sigmoid output
# returns P(Churn = Yes) as a one-column matrix; flatten to a vector.
# (predict_proba()/predict_classes() are deprecated and removed in
# keras >= 2.6 — predict() works across versions.)
yhat_keras_prob_vec  <- predict(model_keras, as.matrix(x_test_tbl)) %>%
    as.vector()

# Predicted class: threshold the probability at 0.5, which matches
# the old predict_classes() behavior for binary sigmoid outputs.
yhat_keras_class_vec <- ifelse(yhat_keras_prob_vec > 0.5, 1, 0)
```

## Yardstick Package

```{r}
# Format test data and predictions for yardstick metrics:
# truth/estimate as yes/no factors, plus the raw class probability
# (fct_recode comes from forcats, attached via tidyquant/tidyverse)
estimates_keras_tbl <- tibble(
    truth      = as.factor(y_test_vec) %>% fct_recode(yes = "1", no = "0"),
    estimate   = as.factor(yhat_keras_class_vec) %>% fct_recode(yes = "1", no = "0"),
    class_prob = yhat_keras_prob_vec
)

estimates_keras_tbl
```

```{r}
# Treat the SECOND factor level ("yes") as the event of interest.
# NOTE(review): yardstick.event_first was later deprecated in favor
# of the per-function `event_level` argument — confirm the installed
# yardstick version still honors this option.
options(yardstick.event_first = FALSE)
# Confusion Table
estimates_keras_tbl %>% conf_mat(truth, estimate)
# Accuracy
estimates_keras_tbl %>% metrics(truth, estimate)
# AUC
estimates_keras_tbl %>% roc_auc(truth, class_prob)
# Precision and Recall
# NOTE(review): in newer yardstick versions precision()/recall()
# return one-row tibbles rather than scalars, so wrapping them in
# tibble() may need a .estimate extraction — verify against the
# installed version.
tibble(
    precision = estimates_keras_tbl %>% precision(truth, estimate),
    recall    = estimates_keras_tbl %>% recall(truth, estimate)
)
```


#Visualization of the MODEL WITH LIME

LIME stands for Local Interpretable Model-agnostic Explanations, and is a method for explaining black-box machine learning model classifiers.

```{r}
# Inspect the S3 class hierarchy of the keras model — the first class
# name is used to name the lime S3 methods defined below
class(model_keras)
```

```{r}
# lime::model_type() S3 method for keras sequential models:
# tells lime this model performs classification.
model_type.keras.models.Sequential <- function(x, ...) {
    "classification"
}
```

```{r}
# lime::predict_model() S3 method for keras sequential models:
# returns a two-column data frame of class probabilities
# (Yes = P(churn), No = complement), the shape lime expects.
predict_model.keras.models.Sequential <- function(x, newdata, type, ...) {
    p_yes <- predict_proba(object = x, x = as.matrix(newdata))
    data.frame(Yes = p_yes, No = 1 - p_yes)
}
```


```{r}
# Smoke-test our predict_model() method: should return a two-column
# (Yes/No) data frame of probabilities for the test set
predict_model(x = model_keras, newdata = x_test_tbl, type = 'raw') %>%
    tibble::as_tibble()
```
```{r}
# Run lime() on the training set to build the explainer.
# bin_continuous = FALSE: perturb continuous features directly
# rather than binning them first
explainer <- lime::lime(
    x              = x_train_tbl, 
    model          = model_keras, 
    bin_continuous = FALSE)
```

```{r}
# Run explain() on the first 10 test cases: one label and the top 4
# features per case; kernel_width controls how local the fitted
# surrogate model is around each case
explanation <- lime::explain(
    x_test_tbl[1:10,], 
    explainer    = explainer, 
    n_labels     = 1, 
    n_features   = 4,
    kernel_width = 0.5)
```


#FEATURE IMPORTANCE VISUALIZATION

The payoff for the work we put in using LIME is this feature importance plot. This allows us to visualize each of the first ten cases (observations) from the test data. The top four features for each case are shown. Note that they are not the same for each case. The green bars mean that the feature supports the model conclusion, and the red bars contradict. A few important features based on frequency in first ten cases:

#Tenure (7 cases)
#Senior Citizen (5 cases)
#Online Security (4 cases)


```{r}
# Per-case feature importance: top 4 features for each of the first
# 10 test cases (supporting vs contradicting the predicted label)
plot_features(explanation) +
    labs(title = "LIME Feature Importance Visualization",
         subtitle = "Hold Out (Test) Set, First 10 Cases Shown")

# Heatmap view of the same explanations across all 10 cases
plot_explanations(explanation) +
    labs(title = "LIME Feature Importance Heatmap",
         subtitle = "Hold Out (Test) Set, First 10 Cases Shown")
```

# Feature Investigation - Visualization with LIME 

We can investigate features that are most frequent in the LIME feature importance visualization :

#Tenure (7/10 LIME Cases, Highly Correlated)
#Contract (Highly Correlated)
#Internet Service (Highly Correlated)
#Payment Method (Highly Correlated)
#Senior Citizen (5/10 LIME Cases)
#Online Security (4/10 LIME Cases)

```{r}
# Tenure: distribution of tenure by churn status (jitter + violin).
churn_data_raw %>%
    ggplot(aes(x = Churn, y = tenure)) +
    geom_jitter(alpha = 0.25, color = palette_light()[[6]]) +
    geom_violin(alpha = 0.6, fill = palette_light()[[1]]) +
    theme_tq() +
    labs(title = "Tenure",
         subtitle = "Customers with lower tenure are more likely to leave")

# Contract: churn (encoded 0/1) by contract type.
churn_data_raw %>%
    mutate(Churn = as.numeric(Churn == "Yes")) %>%
    ggplot(aes(x = factor(Contract), y = Churn)) +
    geom_jitter(alpha = 0.25, color = palette_light()[[6]]) +
    geom_violin(alpha = 0.6, fill = palette_light()[[1]]) +
    theme_tq() +
    labs(title = "Contract Type",
         subtitle = "Two and one year contracts much less likely to leave",
         x = "Contract")

# Internet Service: churn (0/1) by service type.
churn_data_raw %>%
    mutate(Churn = as.numeric(Churn == "Yes")) %>%
    ggplot(aes(x = factor(InternetService), y = Churn)) +
    geom_jitter(alpha = 0.25, color = palette_light()[[6]]) +
    geom_violin(alpha = 0.6, fill = palette_light()[[1]]) +
    theme_tq() +
    labs(title = "Internet Service",
         subtitle = "Fiber optic more likely to leave",
         x = "Internet Service")

# Payment Method: churn (0/1) by payment method.
churn_data_raw %>%
    mutate(Churn = as.numeric(Churn == "Yes")) %>%
    ggplot(aes(x = factor(PaymentMethod), y = Churn)) +
    geom_jitter(alpha = 0.25, color = palette_light()[[6]]) +
    geom_violin(alpha = 0.6, fill = palette_light()[[1]]) +
    theme_tq() +
    labs(title = "Payment Method",
         subtitle = "Electronic check more likely to leave",
         x = "Payment Method")

# Senior Citizen: churn (0/1) by senior-citizen flag.
churn_data_raw %>%
    mutate(Churn = as.numeric(Churn == "Yes")) %>%
    ggplot(aes(x = factor(SeniorCitizen), y = Churn)) +
    geom_jitter(alpha = 0.25, color = palette_light()[[6]]) +
    geom_violin(alpha = 0.6, fill = palette_light()[[1]]) +
    theme_tq() +
    labs(title = "Senior Citizen",
         subtitle = "Non-senior citizens less likely to leave",
         x = "Senior Citizen (Yes = 1)")

# Online Security: churn (0/1) by online-security status.
churn_data_raw %>%
    mutate(Churn = as.numeric(Churn == "Yes")) %>%
    ggplot(aes(x = OnlineSecurity, y = Churn)) +
    geom_jitter(alpha = 0.25, color = palette_light()[[6]]) +
    geom_violin(alpha = 0.6, fill = palette_light()[[1]]) +
    theme_tq() +
    labs(title = "Online Security",
         subtitle = "Customers without online security are more likely to leave")
```


